#!/usr/bin/env python3
'''
Author: Paschalis Agapitos
Project: Mestizajes

Provides corpus-construction utilities for the embedding pipeline by loading
filtered link data, canonicalizing target link text, and organizing links into
grouped corpora at both the century and person levels. These structured
outputs support consistent vocabulary creation and downstream embedding
generation.
'''

from pathlib import Path
from typing import Dict, List, Tuple, Optional

import pandas as pd


class LinkCorpusBuilder:
    """Build lists of canonicalized link targets grouped by century and by person.

    Typical use: construct with the parquet path, call :meth:`load`, then call
    :meth:`links_by_century` and/or :meth:`links_by_person`.

    The methods below read the columns ``Source``, ``Target`` and
    ``century_of_birth`` from the loaded frame.
    """

    def __init__(self, parquet_path: Path):
        """Remember where the parquet file lives; nothing is read yet."""
        self.path = Path(parquet_path)
        # Populated by load(); stays None until then.
        self.df: Optional[pd.DataFrame] = None

    def load(self) -> None:
        """Read the parquet file into ``self.df`` and clean the ``Target`` column."""
        df = pd.read_parquet(self.path, engine="pyarrow").copy()
        df["Target"] = self._clean_targets(df["Target"])
        self.df = df

    @staticmethod
    def _clean_targets(targets: pd.Series) -> pd.Series:
        """Return *targets* as stripped strings, with missing values as ``""``.

        A plain ``astype(str)`` would turn missing values into the literal
        strings ``"None"`` / ``"nan"``, which would then be treated as real
        links. Blanking them instead lets the grouping step drop them.
        """
        missing = targets.isna()
        return targets.astype(str).str.strip().mask(missing, "")

    def links_by_century(self) -> pd.Series:
        """Group canonicalized targets by ``century_of_birth``.

        Returns:
            Series indexed by int century (index name ``"century"``) whose
            values are lists of canonicalized link strings.

        Raises:
            ValueError: if :meth:`load` has not been called. The integer cast
                of ``century_of_birth`` may also raise if the column holds
                values pandas cannot convert (e.g. missing values).
        """
        if self.df is None:
            raise ValueError("Call load() first.")
        centuries = self.df["century_of_birth"].astype(int)
        targets = self.df["Target"]
        grp = (
            pd.DataFrame({"century": centuries, "Target": targets})
            .groupby("century")["Target"]
            .apply(self._collect)
        )
        return grp  # index: int century → List[str]

    def links_by_person(self) -> pd.Series:
        """Group canonicalized targets by the ``Source`` column.

        Returns:
            Series indexed by ``Source`` value whose values are lists of
            canonicalized link strings.

        Raises:
            ValueError: if :meth:`load` has not been called.
        """
        if self.df is None:
            raise ValueError("Call load() first.")
        grp = self.df.groupby("Source")["Target"].apply(self._collect)
        return grp  # index: person name → List[str]

    @classmethod
    def _collect(cls, col: pd.Series) -> List[str]:
        """Canonicalize every target in *col*, dropping missing and empty ones.

        The emptiness check runs after canonicalization so that targets made
        up solely of whitespace/underscores do not survive as blank tokens.
        """
        canonical = (cls._canonicalize(raw) for raw in col.dropna())
        return [link for link in canonical if link]

    @staticmethod
    def _canonicalize(s: str) -> str:
        """Collapse runs of whitespace/underscores to one space and trim the ends."""
        import re
        # Strip after substituting: a leading or trailing underscore becomes
        # a space that must be trimmed as well.
        return re.sub(r"[\s_]+", " ", str(s)).strip()

    @staticmethod
    def build_vocab(series_of_lists: pd.Series) -> Tuple[List[str], Dict[str, int]]:
        """Build a sorted vocabulary from a series of token lists.

        Returns:
            A pair ``(tokens, index)`` where ``tokens`` is the sorted list of
            unique strings and ``index`` maps each string to its position in
            ``tokens``.
        """
        uniq = sorted({x for lst in series_of_lists for x in lst})
        return uniq, {s: i for i, s in enumerate(uniq)}


def build_corpora_from_parquet(parquet_path: Path):
    """
    Load the filtered parquet file and derive both grouped corpora from it.

    Returns:
        A 3-tuple of:
        builder: the LinkCorpusBuilder instance, already loaded
        links_by_century: Series of link lists grouped by century
        links_by_person: Series of link lists grouped by person
    """
    corpus_builder = LinkCorpusBuilder(parquet_path)
    corpus_builder.load()
    # Tuple elements evaluate left to right: century grouping, then person.
    return (
        corpus_builder,
        corpus_builder.links_by_century(),
        corpus_builder.links_by_person(),
    )
